library(ggplot2); theme_set(theme_bw())
## requires ggplot2 version >= 4.0.0
library(lme4)
library(glmmTMB)
library(lmerTest)
library(ggplot2)
library(tidyverse)
library(plotly)
library(buildmer)
library(broom.mixed)
library(bbmle)
library(DHARMa)
library(effects)
You may want to get previously fitted models (this uses the
much-maligned attach() function)
dd <- readRDS("data/ecoreg.rds")
mod_list <- readRDS("outputs/mod_list.rds")
attach(mod_list)
buildmer_list <- readRDS("outputs/buildmer.rds")
attach(buildmer_list)
confint_list <- readRDS("outputs/confint.rds")
attach(confint_list)
confint_tidy <- readRDS("outputs/confint_tidy.rds")
(These time-consuming fits etc. are actually done using
lecture2_batch.R)
From Moritz et al. (2023): 620 global observations
Sampling locations/ecoregions:
par(las=1, bty = "l")
plot(y ~ x, data = dd, xlab = "longitude", ylab = "latitude")
(not bothering to do a proper projection, land boundaries etc.)
dd <- readRDS("data/ecoreg.rds")
Draw figures: ggplot is particular good for its
flexibility — you can easily switch the way you incorporate different
predictors (as facets, colours, etc.).
geom_ellipse or
ggalt::geom_encircle() …)Competitors with similar capabilities include tinyplot
(R): plotnine, seaborn (Python);
gadfly, AlgebraOfGraphic (Julia), but
ggplot is mature (e.g. see ggplot extensions
gallery)
gg0 <- ggplot(dd, aes(NPP_log_sc, mbirds_log, colour = biome, shape = flor_realms)) +
geom_point() +
geom_smooth(method = "lm", aes(group = biome_FR), alpha = 0.1)
print(gg0)
Organize plots by biome or floristic realm:
gg1 <- gg0 + facet_wrap(~biome)
gg2 <- gg0 + facet_wrap(~flor_realms)
With pop-up tooltips (nice for identifying points interactively):
ggplotly(gg2)
(could also try out ggiraph for this?)
This is the full model used in Moritz et al. (2023). The fixed effects include all two-way interactions between the covariates of interest (NPP, Fire, annual coefficients of variation of each, all appropriately scaled and/or log-transformed). Random effects include main effects only, varying at the three levels of realms (e.g. “Afrotropics”, biomes (e.g. “Tropical grassland”), and their interaction (e.g. “Tropical grasslands in the Afrotropics”). We didn’t even try to fit interactions; these covariance matrices are already 5x5 (15 parameters each …)
I actually used text manipulation to set up the formula, so I could build lots of variations on the model. This makes the code more compact and reproducible (less cutting and pasting), at the expense of transparency.
Here I’m analyzing just the bird data (mbirds_log); for
the paper we analyzed diversity of birds, mammals and amphibians
separately.
form <- mbirds_log ~ (NPP_log_sc + Feat_log_sc + NPP_cv_sc + Feat_cv_sc)^2 +
(1 + NPP_log_sc + Feat_log_sc + NPP_cv_sc + Feat_cv_sc | biome) +
(1 + NPP_log_sc + Feat_log_sc + NPP_cv_sc + Feat_cv_sc | flor_realms) +
(1 + NPP_log_sc + Feat_log_sc + NPP_cv_sc + Feat_cv_sc | biome_FR)
Fit the model and take a first glance at the RE covariance matrix:
m_full <- lmer(form, data = dd)
(about 10 seconds)
VarCorr(m_full)
## Groups Name Std.Dev. Corr
## biome_FR (Intercept) 0.107644
## NPP_log_sc 0.122015 0.055
## Feat_log_sc 0.024896 -0.975 -0.276
## NPP_cv_sc 0.034689 -0.999 -0.015 0.965
## Feat_cv_sc 0.061427 0.981 0.247 -1.000 -0.972
## biome (Intercept) 0.125732
## NPP_log_sc 0.158137 -0.470
## Feat_log_sc 0.013087 -0.996 0.389
## NPP_cv_sc 0.091421 -0.512 0.999 0.434
## Feat_cv_sc 0.044459 0.901 -0.806 -0.859 -0.834
## flor_realms (Intercept) 0.236549
## NPP_log_sc 0.130949 -0.346
## Feat_log_sc 0.120903 0.608 -0.955
## NPP_cv_sc 0.032681 0.012 0.908 -0.764
## Feat_cv_sc 0.077954 0.357 -0.293 0.360 0.045
## Residual 0.196065
An example of text manipulation, used to illustrate more maximal models (4-way interactions in the fixed effects, two-way interactions in the random effects terms)
vars <- c("NPP_log_sc", "Feat_log_sc", "NPP_cv_sc", "Feat_cv_sc")
all_vars <- paste(vars, collapse = "+")
max_form <- sprintf("(%s)^4 + ((%s)^2 | biome) + ((%s)^2 | flor_realms) + ((%s)^2 | biome_FR)",
all_vars, all_vars, all_vars, all_vars)
max_form2 <- reformulate(max_form, response = "mbirds_log")
Fit the ridiculously full model (use calc.derivs=FALSE
to skip gradient/Hessian checking, speeding things up)
m_fullmax <- lmer(max_form2, data = dd, control = lmerControl(calc.derivs = FALSE))
(this model takes about 100 minutes to fit …)
Trying out the buildmer package to automatically reduce
the model:
m_buildmer <- buildmer(form, data = dd)
Decides on intercepts only at each stage. Unfortunately it also does model selection on the fixed effects by default, which I strongly disagree with (for philosophy about model selection/averaging see Bolker (2024))
Use include to force buildmer to keep all
of the fixed-effect terms:
m_buildmer_fix <- buildmer(form, data = dd,
buildmerControl =
buildmerControl(
include= ~(NPP_log_sc + Feat_log_sc + NPP_cv_sc + Feat_cv_sc)^2))
For birds and amphibians, the selected model included independent effects of the predictors at the biome/realm interaction level and intercept-only (mean diversity) variation at the biome and realm levels; for mammals, the (independent) predictor effects were included at the realm level, with mean- diversity effects at the biome and biome/realm levels.
In Moritz et al. (2023), we fitted all combinations of [intercept-only, diagonal, full] at each level [biome, realm, biome × realm]: 3 choices at each of three hierarchical levels → 81 sub-models to fit. We chose the minimum-AIC, non-singular model, which turned out to be a diagonal covariance matrix at the (biome×realm) level and intercepts only at the other two levels:
form2 <- mbirds_log ~ (NPP_log_sc + Feat_log_sc + NPP_cv_sc + Feat_cv_sc)^2 +
(1 + NPP_log_sc + Feat_log_sc + NPP_cv_sc + Feat_cv_sc || biome_FR) +
(1 | biome) +
(1 | flor_realms)
m_moritz <- lmer(form2, data = dd)
For comparison, let’s try this in glmmTMB as well:
m_glmmTMB <- glmmTMB(form2, data = dd, REML = TRUE)
Check the individual VarCorr terms:
sapply(VarCorr(m_full), det)
## biome_FR biome flor_realms
## -1.613694e-41 3.538280e-43 -6.052761e-33
All three terms are singular (determinant \(\ll 1\)), so let’s replace them all with diagonal versions:
## string-processing a formula is usually a no-no but I'm in a hurry ...
form_diag <- as.formula(gsub("|", "||", deparse1(form), fixed = TRUE))
m_kliegl1 <- lmer(form_diag, data = dd)
## boundary (singular) fit: see help('isSingular')
## use internal machinery for brevity
lme4:::.prt.VC(VarCorr(m_kliegl1), digits = 3)
## Random effects:
## Groups Name Std.Dev.
## biome_FR Feat_cv_sc 0.0725
## biome_FR.1 NPP_cv_sc 0.0746
## biome_FR.2 Feat_log_sc 0.0316
## biome_FR.3 NPP_log_sc 0.0883
## biome_FR.4 (Intercept) 0.1285
## biome Feat_cv_sc 0.0308
## biome.1 NPP_cv_sc 0.0000
## biome.2 Feat_log_sc 0.0000
## biome.3 NPP_log_sc 0.1321
## biome.4 (Intercept) 0.1219
## flor_realms Feat_cv_sc 0.0670
## flor_realms.1 NPP_cv_sc 0.0000
## flor_realms.2 Feat_log_sc 0.0959
## flor_realms.3 NPP_log_sc 0.0918
## flor_realms.4 (Intercept) 0.2201
## Residual 0.1955
This suggests we should drop NPP_cv_sc from the
flor_realms term; NPP_cv_sc and
Feat_log_sc from biome; and keep all terms for
biome_FR
form_kliegl2 <- mbirds_log ~
(NPP_log_sc + Feat_log_sc + NPP_cv_sc + Feat_cv_sc)^2 +
(1 + NPP_log_sc + Feat_cv_sc || biome) +
(1 + NPP_log_sc + Feat_log_sc + Feat_cv_sc || flor_realms) +
(1 + NPP_log_sc + Feat_log_sc + NPP_cv_sc + Feat_cv_sc || biome_FR)
m_kliegl2 <- lmer(form_kliegl2, data = dd)
isSingular(m_kliegl2)
## [1] FALSE
Can we get away with full covariances once we drop these terms?
form_kliegl3 <- as.formula(gsub("||", "|", deparse1(form_kliegl2), fixed = TRUE))
m_kliegl3 <- lmer(form_kliegl3, data = dd)
## boundary (singular) fit: see help('isSingular')
## Warning: Model failed to converge with 1 negative eigenvalue: -7.1e-01
sapply(VarCorr(m_kliegl3), det)
## biome_FR biome flor_realms
## 2.477975e-27 7.076794e-16 0.000000e+00
form_kliegl4 <- mbirds_log ~
(NPP_log_sc + Feat_log_sc + NPP_cv_sc + Feat_cv_sc)^2 +
rr(1 + NPP_log_sc + Feat_cv_sc | biome, d=2) +
rr(1 + NPP_log_sc + Feat_log_sc + Feat_cv_sc | flor_realms, d=3) +
rr(1 + NPP_log_sc + Feat_log_sc + NPP_cv_sc + Feat_cv_sc | biome_FR, d=4)
m_kliegl4 <- glmmTMB(form_kliegl4, dd, REML=TRUE)
mod_list <- tibble::lst(m_moritz, m_full, m_fullmax,
m_kliegl4,
m_buildmer_mod = m_buildmer@model,
m_buildmer_fix_mod = m_buildmer_fix@model,
m_glmmTMB)
odir <- "outputs"
if (!dir.exists(odir)) dir.create(odir)
saveRDS(mod_list, file = file.path(odir, "mod_list.rds"))
saveRDS(tibble::lst(m_buildmer, m_buildmer_fix),
file = file.path(odir, "buildmer.rds"))
performance::check_model(m_moritz)
pp1 <- DHARMa::simulateResiduals(m_moritz)
## plot(pp1)
pp2 <- DHARMa::simulateResiduals(m_moritz, use.u = TRUE)
plot(pp2)
It can be useful to plot against particular covariates, and to turn off the automatic scaling/ranking:
plotResiduals(pp1, form = dd$NPP_log_sc)
plotResiduals(pp1, form = dd$NPP_log_sc, rank = FALSE)
bbmle::AICtab(mod_list)
## dAIC df
## m_buildmer_mod 0.0 11
## m_glmmTMB 31.5 19
## m_moritz 31.5 19
## m_kliegl4 43.5 40
## m_buildmer_fix_mod 44.6 16
## m_full 72.0 57
## m_fullmax 334.6 215
Downstream processing (contrasts, marginal effects, plotting, etc.):
effects, broom.mixed, sjPlot,
marginaleffects, ggeffects,
emmeans (see Mixed
Models Task View)
You must (??) decide on a model before comparing parameter estimates — this step is for understanding sensitivity, not for choosing which model to use.
tt <- mod_list |>
purrr::map_dfr(tidy, effects = "fixed", .id = "model", conf.int = TRUE) |>
filter(term != "(Intercept)")
gg_coef <- ggplot(tt, aes(estimate, term, colour = model)) +
geom_pointrange(aes(xmin = conf.low, xmax = conf.high),
position = position_dodge(width = 0.75)) +
geom_vline(xintercept = 0, lty = 2) +
labs(y = "")
print(gg_coef)
Drop “fullmax” model:
mod_list2 <- mod_list[names(mod_list) != "m_fullmax"]
tt2 <- mod_list2 |>
purrr::map_dfr(tidy, effects = "fixed", .id = "model", conf.int = TRUE) |>
filter(term != "(Intercept)")
print(gg_coef + tt2)
ci_wald <- confint(m_moritz, method = "Wald", signames = FALSE)
ci_profile <- confint(m_moritz, method = "profile", signames = FALSE)
ci_boot <- confint(m_moritz, method = "boot", nsim = 1000, parallel = "multicore", ncpus = 6, signames = FALSE)
confint_list <- tibble::lst(ci_wald, ci_profile, ci_boot)
confint_tidy <- confint_list |>
map(as.data.frame) |>
map(~rownames_to_column(., var = "term")) |>
bind_rows(.id = "method") |>
mutate(effect = ifelse(grepl("^(sd|cor|sigma)", term), "random", "fixed")) |>
rename(lwr = "2.5 %", upr = "97.5 %") |>
filter(term != "(Intercept)")
ggplot(confint_tidy, aes(xmin = lwr, xmax = upr, y = term, colour = method)) +
geom_linerange(position = position_dodge(width = 0.75)) +
geom_vline(lty = 2, xintercept = 0) +
facet_wrap(~effect, scale = "free")
‘caterpillar plots’
cowplot::plot_grid(plotlist = lattice::dotplot(ranef(m_moritz)))
Effects plots, with partial residuals (also see the
remef package, on GitHub).
For conditional and marginal predictions/effects plots, see also
emmeans/sjPlot/ggeffects/marginaleffects
packages …
plot(allEffects(m_buildmer_fix_mod, partial.residuals = TRUE))
Would like to try marginaleffects but struggling …